# Print the current working directory (the folder "diabetes.csv" is read from).
getwd()
## [1] "/Users/tiffa.chua/Desktop/HS614/HS614_Data-Analysis"
For this HW use the diabetes dataset.
Exploratory Data Analysis: 1. Explore the data 2. How many rows and columns? 3. Any NAs? If yes, which columns? 4. Any strange values? If yes, what to do? 5. Any correlation between features? 6. Statistics of the columns and some exploratory plots (bar plot, histogram, boxplot …)
Build at least 4 different predictive models that can predict (classify) Diabetic patients. Compare the performance of the models.
Column definitions of this dataset are given in table 1 of this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8306487/
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caTools)
library(caret)
## Loading required package: lattice
library(class)
library(e1071)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.90 loaded
library(ISLR)
library(cluster)
library(datasets)
library(pastecs)
library(psych)
library(corrplot)
library(stats)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.3 ✓ purrr 0.3.4
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x tidyr::extract() masks pastecs::extract()
## x dplyr::filter() masks mice::filter(), stats::filter()
## x dplyr::first() masks pastecs::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks pastecs::last()
## x purrr::lift() masks caret::lift()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
## Warning: package 'rpart' was built under R version 4.1.2
library(rpart.plot)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
library(animation)
# Load the Pima diabetes dataset: one row per patient, 9 columns.
diab <- read.csv("./diabetes.csv")
head(diab)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
str(diab)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Encode the 0/1 target as a factor so modelling functions treat this as
# classification (1 = diabetic per the dataset description linked above).
diab$Outcome = factor(diab$Outcome)
str(diab)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
# Class balance: 500 non-diabetic vs 268 diabetic patients.
table(diab$Outcome)
##
## 0 1
## 500 268
nrow(diab)
## [1] 768
ncol(diab)
## [1] 9
# No literal NA values are recorded in the file.
sum(is.na(diab))
## [1] 0
missmap(diab)
# Note: While there are no NA values, missing data points seem to have been recorded as 0 in several columns.
Note: While there are no NA values, missing data points appear to have been recorded as 0 in several columns.
# Count the suspicious zeros column by column: a value of 0 is physiologically
# implausible for these measurements, so zeros here most likely encode
# missing data.
sum(diab$Glucose == 0)
## [1] 5
sum(diab$BloodPressure == 0)
## [1] 35
sum(diab$SkinThickness == 0)
## [1] 227
sum(diab$Insulin == 0)
## [1] 374
sum(diab$BMI == 0)
## [1] 11
sum(diab$DiabetesPedigreeFunction == 0)
## [1] 0
sum(diab$Age == 0)
## [1] 0
#Strange Values:
#The following columns returned values of 0: Glucose, BloodPressure, SkinThickness, Insulin, and BMI. It is likely that missing/NA values for these columns were replaced with 0, as 0 is a highly unlikely and unrealistic observation for these features.
Strange Values: The following columns contain values of 0: Glucose, BloodPressure, SkinThickness, Insulin, and BMI. It is likely that missing/NA values in these columns were replaced with 0, as 0 is a highly unlikely and unrealistic observation for these features.
# What to do with the strange values?
# Work on a copy so the raw data frame stays untouched.
diab1 <- diab
# Recode the biologically impossible zeros as NA, column by column.
cols_with_fake_zeros <- c("Glucose", "BloodPressure", "SkinThickness",
                          "Insulin", "BMI")
for (col in cols_with_fake_zeros) {
  diab1[[col]][diab1[[col]] == 0] <- NA
}
sum(is.na(diab1))
## [1] 652
missmap(diab1)
# Mean imputation of these NAs happens in the Predictive Models section,
# after the train/test split, so test-set information cannot leak into
# the training means.
What to do with the strange values? 1. Turn the strange (0) values into NA. 2. I will use mean imputation on the NA values in the Predictive Models section (after the Train/Test split).
# Column-wise descriptive statistics (pastecs). The Outcome column is a
# factor, so stat.desc() reports NA for it.
stat.desc(diab1)
## Pregnancies Glucose BloodPressure SkinThickness Insulin
## nbr.val 768.0000000 7.630000e+02 7.330000e+02 5.410000e+02 3.940000e+02
## nbr.null 111.0000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## nbr.na 0.0000000 5.000000e+00 3.500000e+01 2.270000e+02 3.740000e+02
## min 0.0000000 4.400000e+01 2.400000e+01 7.000000e+00 1.400000e+01
## max 17.0000000 1.990000e+02 1.220000e+02 9.900000e+01 8.460000e+02
## range 17.0000000 1.550000e+02 9.800000e+01 9.200000e+01 8.320000e+02
## sum 2953.0000000 9.284700e+04 5.307300e+04 1.577200e+04 6.128600e+04
## median 3.0000000 1.170000e+02 7.200000e+01 2.900000e+01 1.250000e+02
## mean 3.8450521 1.216868e+02 7.240518e+01 2.915342e+01 1.555482e+02
## SE.mean 0.1215892 1.105464e+00 4.573454e-01 4.504407e-01 5.983841e+00
## CI.mean.0.95 0.2386871 2.170117e+00 8.978652e-01 8.848307e-01 1.176434e+01
## var 11.3540563 9.324254e+02 1.533178e+02 1.097672e+02 1.410770e+04
## std.dev 3.3695781 3.053564e+01 1.238216e+01 1.047698e+01 1.187759e+02
## coef.var 0.8763413 2.509364e-01 1.710120e-01 3.593740e-01 7.635951e-01
## BMI DiabetesPedigreeFunction Age Outcome
## nbr.val 7.570000e+02 768.00000000 7.680000e+02 NA
## nbr.null 0.000000e+00 0.00000000 0.000000e+00 NA
## nbr.na 1.100000e+01 0.00000000 0.000000e+00 NA
## min 1.820000e+01 0.07800000 2.100000e+01 NA
## max 6.710000e+01 2.42000000 8.100000e+01 NA
## range 4.890000e+01 2.34200000 6.000000e+01 NA
## sum 2.457030e+04 362.40100000 2.552900e+04 NA
## median 3.230000e+01 0.37250000 2.900000e+01 NA
## mean 3.245746e+01 0.47187630 3.324089e+01 NA
## SE.mean 2.516930e-01 0.01195579 4.243608e-01 NA
## CI.mean.0.95 4.941002e-01 0.02346996 8.330464e-01 NA
## var 4.795546e+01 0.10977864 1.383030e+02 NA
## std.dev 6.924988e+00 0.33132860 1.176023e+01 NA
## coef.var 2.133558e-01 0.70215138 3.537882e-01 NA
# Five-number summaries; the NA counts match the zero counts found earlier.
summary(diab1)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## Outcome
## 0:500
## 1:268
##
##
##
##
##
# Keep the eight numeric predictors (drop the factor Outcome) for correlation.
diab_num <- diab1[1:8]
# cor() propagates the NAs we introduced above: with the default
# use = "everything", any pair of columns sharing a missing row gets an NA
# correlation, which made almost the whole matrix NA in the original run
# (see the printed matrix below). Restricting each pairwise correlation to
# the rows where both variables are observed fixes this.
M <- cor(diab_num,
method = "pearson",
use = "pairwise.complete.obs")
M
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 NA NA NA
## Glucose NA 1 NA NA
## BloodPressure NA NA 1 NA
## SkinThickness NA NA NA 1
## Insulin NA NA NA NA
## BMI NA NA NA NA
## DiabetesPedigreeFunction -0.03352267 NA NA NA
## Age 0.54434123 NA NA NA
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies NA NA -0.03352267 0.54434123
## Glucose NA NA NA NA
## BloodPressure NA NA NA NA
## SkinThickness NA NA NA NA
## Insulin 1 NA NA NA
## BMI NA 1 NA NA
## DiabetesPedigreeFunction NA NA 1.00000000 0.03356131
## Age NA NA 0.03356131 1.00000000
## (NOTE: the matrix above is from the original run WITHOUT `use =`; with
## pairwise-complete observations the NAs are replaced by real estimates.)
corrplot(M,
method = "square")
# psych::cor.plot handles the missing data itself, which is where the
# correlation values quoted below came from.
cor.plot(diab_num)
#Highest Correlating Variables:
# BMI & SkinThickness = 0.65
# Insulin & Glucose = 0.58
# Pregnancies & Age = 0.54
# BloodPressure & Age = 0.33
# BMI & BloodPressure = 0.29
Highest Correlating Variables: * BMI & SkinThickness = 0.65 * Insulin & Glucose = 0.58 * Pregnancies & Age = 0.54 * BloodPressure & Age = 0.33 * BMI & BloodPressure = 0.29
# Pairwise scatter plots / densities / correlations for all columns.
# The warnings below simply restate the per-column NA counts found earlier.
ggpairs(diab1)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 40 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 232 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 375 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 229 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 39 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning: Removed 35 rows containing non-finite values (stat_boxplot).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 232 rows containing missing values (geom_point).
## Warning: Removed 229 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 229 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values
## Warning: Removed 227 rows containing non-finite values (stat_boxplot).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 375 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 375 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning: Removed 374 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 39 rows containing missing values (geom_point).
## Warning: Removed 229 rows containing missing values (geom_point).
## Warning: Removed 375 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 227 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 374 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bar plot of the class balance (0 vs 1).
ggplot(diab1, aes(Outcome)) +
  geom_bar()
# Grouped
# p <- data %>%
# ggplot( aes(x=value, fill=type)) +
# geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
# scale_fill_manual(values=c("#69b3a2", "#404080")) +
# theme_ipsum() +
# labs(fill="")
# Outcome-coloured histograms of selected features; the warnings restate
# the NA counts for each column.
Glucose_hist <- ggplot(diab1, aes(x = Glucose, fill = Outcome)) +
  geom_histogram(binwidth = 5)
Glucose_hist
## Warning: Removed 5 rows containing non-finite values (stat_bin).
BP_hist <- ggplot(diab1, aes(x = BloodPressure, fill = Outcome)) +
  geom_histogram(binwidth = 5)
BP_hist
## Warning: Removed 35 rows containing non-finite values (stat_bin).
BMI_hist <- ggplot(diab1, aes(x = BMI, fill = Outcome)) +
  geom_histogram(binwidth = 2)
BMI_hist
## Warning: Removed 11 rows containing non-finite values (stat_bin).
Ped_hist <- ggplot(diab1, aes(x = DiabetesPedigreeFunction, fill = Outcome)) +
  geom_histogram(binwidth = 0.075)
Ped_hist
Age_hist <- ggplot(diab1, aes(x = Age, fill = Outcome)) +
  geom_histogram(binwidth = 2)
Age_hist
#Highest Correlating Variables:
# BMI & SkinThickness = 0.65
# Insulin & Glucose = 0.58
# Pregnancies & Age = 0.54
# BloodPressure & Age = 0.33
# BMI & BloodPressure = 0.29
# Scatter plots of the most correlated feature pairs, coloured by Outcome.
# FIX: the original passed na.rm = TRUE to ggplot(); ggplot() has no na.rm
# argument, so it was silently swallowed by `...` and had no effect — the
# "Removed N rows" warnings below prove the NAs were still dropped with a
# warning. The misleading no-op argument has been removed (plots unchanged).
# BMI & SkinThickness = 0.65
BMI_Skin <- ggplot(data = diab1,
                   aes(x = BMI,
                       y = SkinThickness,
                       color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "BMI vs. Skin Thickness")
BMI_Skin
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 229 rows containing non-finite values (stat_smooth).
## Warning: Removed 229 rows containing missing values (geom_point).
# Insulin & Glucose = 0.58
Insu_Glucose <- ggplot(data = diab1,
                       aes(x = Insulin,
                           y = Glucose,
                           color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Insulin vs. Glucose")
Insu_Glucose
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 375 rows containing non-finite values (stat_smooth).
## Warning: Removed 375 rows containing missing values (geom_point).
# Pregnancies & Age = 0.54
Age_Preg <- ggplot(data = diab1,
                   aes(x = Age,
                       y = Pregnancies,
                       color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Pregnancies")
Age_Preg
## `geom_smooth()` using formula 'y ~ x'
# BloodPressure & Age = 0.33
Age_BP <- ggplot(data = diab1,
                 aes(x = Age,
                     y = BloodPressure,
                     color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Blood Pressure")
Age_BP
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).
# Glucose & Age = 0.32
Age_Glucose <- ggplot(data = diab1,
                      aes(x = Age,
                          y = Glucose,
                          color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Glucose")
Age_Glucose
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
# BloodPressure & BMI = 0.29
BMI_BP <- ggplot(data = diab1,
                 aes(x = BMI,
                     y = BloodPressure,
                     color = Outcome)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "BMI vs. Blood Pressure")
BMI_BP
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (stat_smooth).
## Warning: Removed 39 rows containing missing values (geom_point).
# Stratified 80/20 train/test split (caTools preserves the Outcome ratio).
# Seed fixed for reproducibility.
set.seed(123)
split <- sample.split(diab1$Outcome,
SplitRatio = 0.8)
diab.train <- subset(diab1,
split == TRUE)
# FIX: the original wrapped this in a redundant second subset() call
# (subset(subset(...)) with no condition is a no-op).
diab.test <- subset(diab1,
split == FALSE)
nrow(diab.test)
## [1] 154
nrow(diab.train)
## [1] 614
# Sanity checks on the split: the NAs are still present and will be imputed
# next, using training-set statistics only.
head(diab.train)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 NA 33.6
## 2 1 85 66 29 NA 26.6
## 3 8 183 64 NA NA 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 8 10 115 NA NA NA 35.3
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 8 0.134 29 0
str(diab.train)
## 'data.frame': 614 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 10 8 4 10 10 ...
## $ Glucose : int 148 85 183 89 137 115 125 110 168 139 ...
## $ BloodPressure : int 72 66 64 66 40 NA 96 92 74 80 ...
## $ SkinThickness : int 35 29 NA 23 35 NA NA NA NA NA ...
## $ Insulin : int NA NA NA 94 168 NA NA NA NA NA ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 35.3 NA 37.6 38 27.1 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 29 54 30 34 57 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
nrow(diab.train)
## [1] 614
ncol(diab.train)
## [1] 9
sum(is.na(diab.train))
## [1] 536
sum(is.na(diab.test))
## [1] 116
# Mean-impute the training set: each NA is replaced by the mean of the
# observed values in its own column (training data only).
train_impute_cols <- c("Glucose", "BloodPressure", "SkinThickness",
                       "Insulin", "BMI")
for (col in train_impute_cols) {
  diab.train[[col]][is.na(diab.train[[col]])] <- mean(diab.train[[col]],
                                                      na.rm = TRUE)
}
sum(is.na(diab.train))
## [1] 0
str(diab.train)
## 'data.frame': 614 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 10 8 4 10 10 ...
## $ Glucose : num 148 85 183 89 137 115 125 110 168 139 ...
## $ BloodPressure : num 72 66 64 66 40 ...
## $ SkinThickness : num 35 29 29.2 23 35 ...
## $ Insulin : num 154 154 154 94 168 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 29 54 30 34 57 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
# Impute the test set with the TRAINING-set column means, so no information
# from the test rows leaks into the imputation values.
test_impute_cols <- c("Glucose", "BloodPressure", "SkinThickness",
                      "Insulin", "BMI")
for (col in test_impute_cols) {
  diab.test[[col]][is.na(diab.test[[col]])] <- mean(diab.train[[col]],
                                                    na.rm = TRUE)
}
sum(is.na(diab.test))
## [1] 0
head(diab.test)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 6 5 116 74 29.17647 154.3137 25.6
## 7 3 78 50 32.00000 88.0000 31.0
## 9 2 197 70 45.00000 543.0000 30.5
## 14 1 189 60 23.00000 846.0000 30.1
## 17 0 118 84 47.00000 230.0000 45.8
## 22 8 99 84 29.17647 154.3137 35.4
## DiabetesPedigreeFunction Age Outcome
## 6 0.201 30 0
## 7 0.248 26 1
## 9 0.158 53 1
## 14 0.398 59 1
## 17 0.551 31 1
## 22 0.388 50 0
# Model 1: logistic regression on all eight predictors.
logModel1 <- glm(formula = Outcome ~ . ,
family = binomial,
data = diab.train)
summary(logModel1)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = diab.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5108 -0.7124 -0.3915 0.6682 2.1827
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.799113 0.892047 -9.864 < 2e-16 ***
## Pregnancies 0.135168 0.037465 3.608 0.000309 ***
## Glucose 0.040167 0.004477 8.972 < 2e-16 ***
## BloodPressure -0.015105 0.009399 -1.607 0.108045
## SkinThickness 0.005799 0.015364 0.377 0.705834
## Insulin -0.002570 0.001424 -1.805 0.071029 .
## BMI 0.096264 0.020274 4.748 2.05e-06 ***
## DiabetesPedigreeFunction 0.559306 0.337462 1.657 0.097441 .
## Age 0.014412 0.010702 1.347 0.178086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 793.94 on 613 degrees of freedom
## Residual deviance: 564.46 on 605 degrees of freedom
## AIC: 582.46
##
## Number of Fisher Scoring iterations: 5
# Based on logModel1, the significant predictors (per the summary above) are:
# ***: Pregnancies, Glucose, BMI
# .: Insulin, DiabetesPedigreeFunction
# AIC: 582.46 (the 284.17 quoted in an earlier draft came from a different run)
Based on logModel1, the significant predictors are as follows: ***: Pregnancies, Glucose, BMI; .: Insulin, DiabetesPedigreeFunction.
AIC: 582.46
# Model 1b: logistic regression restricted to the strongest predictors.
logModel2 <- glm(formula = Outcome ~ Glucose + BMI + Insulin,
family = binomial,
data = diab.train)
summary(logModel2)
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Insulin, family = binomial,
## data = diab.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2588 -0.7506 -0.4374 0.7348 2.3611
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.339728 0.733854 -11.364 <2e-16 ***
## Glucose 0.041933 0.004311 9.728 <2e-16 ***
## BMI 0.085804 0.016109 5.326 1e-07 ***
## Insulin -0.002365 0.001381 -1.713 0.0868 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 793.94 on 613 degrees of freedom
## Residual deviance: 595.13 on 610 degrees of freedom
## AIC: 603.13
##
## Number of Fisher Scoring iterations: 4
# Based on logModel2, the significant predictors (per the summary above) are:
# ***: Glucose, BMI
# .: Insulin
# AIC: 603.13
# Because this AIC is higher than for logModel1 (603.13 > 582.46), logModel1
# fits better despite using more predictors.
Based on logModel2, the significant predictors are as follows: ***: Glucose, BMI; .: Insulin.
AIC: 603.13. Because this AIC is higher than for logModel1 (603.13 > 582.46), we can assume that logModel1 is better than logModel2.
# Model 1c: logistic regression with only the two strongest predictors.
logModel3 <- glm(formula = Outcome ~ Glucose + BMI,
family = binomial,
data = diab.train)
summary(logModel3)
##
## Call:
## glm(formula = Outcome ~ Glucose + BMI, family = binomial, data = diab.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2748 -0.7531 -0.4407 0.7271 2.2838
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.235682 0.723816 -11.378 < 2e-16 ***
## Glucose 0.039075 0.003874 10.086 < 2e-16 ***
## BMI 0.082170 0.015837 5.189 2.12e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 793.94 on 613 degrees of freedom
## Residual deviance: 597.95 on 611 degrees of freedom
## AIC: 603.95
##
## Number of Fisher Scoring iterations: 4
# Based on logModel3, the significant predictors (per the summary above) are:
# ***: Glucose, BMI
# AIC: 603.95
# Because this AIC is the highest of the three (603.95 > 603.13 > 582.46),
# logModel3 is not as good as logModel1 or logModel2.
Based on logModel3, the significant predictors are as follows: ***: Glucose, BMI.
AIC: 603.95. Because this AIC is higher than for logModel1 and logModel2 (603.95 > 603.13 > 582.46), we can assume that this model is not as good as logModel1 or logModel2.
# Predicted probability of Outcome == 1 for every test row.
Outcome.predictions <- predict(logModel1,
diab.test,
type = "response")
# BUG FIX: the original cbind()-ed the probabilities with the factor labels
# (coercing them to numeric 1/2) and then thresholded the WHOLE matrix with
# ifelse(log.results > 0.5, 1, 0), which also turned every "real" label into
# 1 — visible in the original head() output, where `real` was all 1s even
# for test rows with Outcome 0. Threshold only the predicted probabilities.
pred.class <- ifelse(Outcome.predictions > 0.5, 1, 0)
results.class <- data.frame(pred = pred.class,
real = diab.test$Outcome)
head(results.class)
# (The `real` column now shows the actual test labels instead of all 1s.)
# Confusion matrix: rows = actual outcome, columns = predicted class.
table(diab.test$Outcome,
results.class$pred)
##
## 0 1
## 0 84 16
## 1 24 30
# Post-imputation sanity checks: both sets are now complete (no NAs), with the
# imputed columns promoted from int to num.
head(diab.train)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72.00000 35.00000 154.3137 33.6
## 2 1 85 66.00000 29.00000 154.3137 26.6
## 3 8 183 64.00000 29.17647 154.3137 23.3
## 4 1 89 66.00000 23.00000 94.0000 28.1
## 5 0 137 40.00000 35.00000 168.0000 43.1
## 8 10 115 72.43027 29.17647 154.3137 35.3
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 8 0.134 29 0
head(diab.test)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 6 5 116 74 29.17647 154.3137 25.6
## 7 3 78 50 32.00000 88.0000 31.0
## 9 2 197 70 45.00000 543.0000 30.5
## 14 1 189 60 23.00000 846.0000 30.1
## 17 0 118 84 47.00000 230.0000 45.8
## 22 8 99 84 29.17647 154.3137 35.4
## DiabetesPedigreeFunction Age Outcome
## 6 0.201 30 0
## 7 0.248 26 1
## 9 0.158 53 1
## 14 0.398 59 1
## 17 0.551 31 1
## 22 0.388 50 0
str(diab.train)
## 'data.frame': 614 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 10 8 4 10 10 ...
## $ Glucose : num 148 85 183 89 137 115 125 110 168 139 ...
## $ BloodPressure : num 72 66 64 66 40 ...
## $ SkinThickness : num 35 29 29.2 23 35 ...
## $ Insulin : num 154 154 154 94 168 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 29 54 30 34 57 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
str(diab.test)
## 'data.frame': 154 obs. of 9 variables:
## $ Pregnancies : int 5 3 2 1 0 8 11 9 2 0 ...
## $ Glucose : num 116 78 197 189 118 99 143 102 90 180 ...
## $ BloodPressure : num 74 50 70 60 84 84 94 76 68 66 ...
## $ SkinThickness : num 29.2 32 45 23 47 ...
## $ Insulin : num 154 88 543 846 230 ...
## $ BMI : num 25.6 31 30.5 30.1 45.8 35.4 36.6 32.9 38.2 42 ...
## $ DiabetesPedigreeFunction: num 0.201 0.248 0.158 0.398 0.551 ...
## $ Age : int 30 26 53 59 31 50 51 46 27 25 ...
## $ Outcome : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 2 2 2 2 ...
sum(is.na(diab.train))
## [1] 0
sum(is.na(diab.test))
## [1] 0
#Fitting K-NN to the Training set and Predicting the Test set results
vec = c()
k_vec = c()
y_pred = knn(train = diab.train[ , -9],
test = diab.test[ , -9],
cl = diab.train[ , 9],
k = 10)
head(y_pred)
## [1] 0 0 1 1 0 0
## Levels: 0 1
# Tune k: refit K-NN for k = 1..50 and record the test-set
# misclassification rate for each value.
# NOTE(review): the original grew `vec`/`k_vec` with c() inside the loop
# (quadratic-time append); both vectors are now preallocated.
k_max <- 50
k_vec <- seq_len(k_max)
vec   <- numeric(k_max)  # test error (1 - accuracy) for each k
for (k in k_vec) {
  y_pred <- knn(train = diab.train[, -9],
                test  = diab.test[, -9],
                cl    = diab.train[, 9],
                k     = k)
  # Fraction of test rows whose predicted class differs from the truth.
  vec[k] <- mean(y_pred != diab.test$Outcome)
}
head(vec)
## [1] 0.3051948 0.3376623 0.3116883 0.3441558 0.3051948 0.2922078
df.error <- data.frame(k_vec, vec)
head(df.error)
## k_vec vec
## 1 1 0.3051948
## 2 2 0.3376623
## 3 3 0.3116883
## 4 4 0.3441558
## 5 5 0.3051948
## 6 6 0.2922078
# Error-vs-k curve; choose k near the minimum of this curve.
ggplot(df.error,
       aes(k_vec, vec)) +
  geom_line()
# Final K-NN fit with k = 9 (chosen from the error-vs-k curve above).
y_pred <- knn(train = diab.train[, -9],
              test  = diab.test[, -9],
              cl    = diab.train[, 9],
              k     = 9)
# Raw confusion matrix: rows = truth (Outcome), columns = predicted class.
cm <- table(diab.test[, 9], y_pred)
cm
## y_pred
## 0 1
## 0 84 16
## 1 23 31
# BUG FIX: caret::confusionMatrix() takes the PREDICTIONS as its first
# argument (`data`) and the ground truth as the second (`reference`).
# The original call passed the truth first, which transposes the reported
# table and swaps sensitivity<->specificity and PPV<->NPV (accuracy and
# kappa are symmetric, so they were unaffected).
confusionMatrix(data = y_pred,
                reference = diab.test[, 9])
## NOTE(review): the output below was produced by the original (swapped)
## call; re-knit to refresh the sensitivity/specificity rows.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 84 16
## 1 23 31
##
## Accuracy : 0.7468
## 95% CI : (0.6705, 0.8133)
## No Information Rate : 0.6948
## P-Value [Acc > NIR] : 0.09312
##
## Kappa : 0.4268
##
## Mcnemar's Test P-Value : 0.33667
##
## Sensitivity : 0.7850
## Specificity : 0.6596
## Pos Pred Value : 0.8400
## Neg Pred Value : 0.5741
## Prevalence : 0.6948
## Detection Rate : 0.5455
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7223
##
## 'Positive' Class : 0
##
# Kappa: 0.4268 -- moderate agreement between KNN predictions and truth beyond chance.
# NOTE(review): this re-inspection duplicates the head/str/is.na checks run
# earlier in the file; kept as-is so the narrative before each model family
# still shows the data the model is trained on.
head(diab.train)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72.00000 35.00000 154.3137 33.6
## 2 1 85 66.00000 29.00000 154.3137 26.6
## 3 8 183 64.00000 29.17647 154.3137 23.3
## 4 1 89 66.00000 23.00000 94.0000 28.1
## 5 0 137 40.00000 35.00000 168.0000 43.1
## 8 10 115 72.43027 29.17647 154.3137 35.3
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 8 0.134 29 0
head(diab.test)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 6 5 116 74 29.17647 154.3137 25.6
## 7 3 78 50 32.00000 88.0000 31.0
## 9 2 197 70 45.00000 543.0000 30.5
## 14 1 189 60 23.00000 846.0000 30.1
## 17 0 118 84 47.00000 230.0000 45.8
## 22 8 99 84 29.17647 154.3137 35.4
## DiabetesPedigreeFunction Age Outcome
## 6 0.201 30 0
## 7 0.248 26 1
## 9 0.158 53 1
## 14 0.398 59 1
## 17 0.551 31 1
## 22 0.388 50 0
# Outcome is a factor in both splits -- required by rpart/randomForest below.
str(diab.train)
## 'data.frame': 614 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 10 8 4 10 10 ...
## $ Glucose : num 148 85 183 89 137 115 125 110 168 139 ...
## $ BloodPressure : num 72 66 64 66 40 ...
## $ SkinThickness : num 35 29 29.2 23 35 ...
## $ Insulin : num 154 154 154 94 168 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 29 54 30 34 57 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
str(diab.test)
## 'data.frame': 154 obs. of 9 variables:
## $ Pregnancies : int 5 3 2 1 0 8 11 9 2 0 ...
## $ Glucose : num 116 78 197 189 118 99 143 102 90 180 ...
## $ BloodPressure : num 74 50 70 60 84 84 94 76 68 66 ...
## $ SkinThickness : num 29.2 32 45 23 47 ...
## $ Insulin : num 154 88 543 846 230 ...
## $ BMI : num 25.6 31 30.5 30.1 45.8 35.4 36.6 32.9 38.2 42 ...
## $ DiabetesPedigreeFunction: num 0.201 0.248 0.158 0.398 0.551 ...
## $ Age : int 30 26 53 59 31 50 51 46 27 25 ...
## $ Outcome : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 2 2 2 2 ...
# Still no missing values after imputation.
sum(is.na(diab.train))
## [1] 0
sum(is.na(diab.test))
## [1] 0
# Decision tree classifier (CART via rpart; method = "class" forces
# classification). NOTE(review): rpart, pROC (roc) and rpart.plot (prp)
# are presumably loaded in the truncated library section above -- confirm.
tree = rpart(formula = Outcome ~ .,
method = "class",
data = diab.train)
#Predicting the Test Set Results
# With no `type` argument, predict.rpart on a classification tree returns
# a matrix of per-class probabilities (columns "0" and "1").
y_pred = predict(tree,
newdata = diab.test)
head(y_pred)
## 0 1
## 6 0.9491525 0.05084746
## 7 0.9298246 0.07017544
## 9 0.1500000 0.85000000
## 14 0.1500000 0.85000000
## 17 0.1666667 0.83333333
## 22 0.9298246 0.07017544
# Threshold the probability of class "0" at 0.5 to recover hard labels.
y_pred_class = ifelse(y_pred[, "0"] >= 0.5,
0, 1)
# Making the Confusion Matrix
cm = table(diab.test$Outcome,
y_pred_class)
cm
## y_pred_class
## 0 1
## 0 88 12
## 1 24 30
# ROC from P(class "0"); because higher scores favor the control class,
# pROC flips the comparison direction (controls > cases). AUC is the same
# as it would be using P(class "1").
result.roc = roc(diab.test$Outcome,
y_pred[ ,"0"])
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
result.roc
##
## Call:
## roc.default(response = diab.test$Outcome, predictor = y_pred[, "0"])
##
## Data: y_pred[, "0"] in 100 controls (diab.test$Outcome 0) > 54 cases (diab.test$Outcome 1).
## Area under the curve: 0.7947
plot(result.roc)
# Base-graphics dendrogram-style tree plot with node counts...
plot(tree,
uniform=TRUE,
main="Diabetes Outcome")
text(tree,
use.n=TRUE,
all=TRUE)
# ...and a cleaner rendering from rpart.plot.
prp(tree)
# Random forest classifier. Because Outcome is a factor, randomForest()
# performs classification automatically. NOTE(review): the original also
# passed method = 'class', which is not a randomForest() argument (it was
# silently swallowed by `...`), so it has been dropped.
RF <- randomForest(formula = Outcome ~ .,
                   data = diab.train)
RF
## (output from the original run)
##
## Call:
## randomForest(formula = Outcome ~ ., data = diab.train, method = "class")
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 25.57%
## Confusion matrix:
## 0 1 class.error
## 0 340 60 0.150000
## 1 97 117 0.453271
# Variable importance: larger MeanDecreaseGini = the variable produces
# purer splits, i.e. is more important (Glucose and BMI dominate here).
importance(RF)
## MeanDecreaseGini
## Pregnancies 23.45711
## Glucose 71.64581
## BloodPressure 24.75545
## SkinThickness 19.84609
## Insulin 22.99613
## BMI 44.23679
## DiabetesPedigreeFunction 32.54496
## Age 37.69331
#MeanDecreaseGini --> Importance (IncNodePurity)
# Feature-only view of the test set (drop column 9, Outcome).
diab.test1 <- diab.test[-9]
head(diab.test1)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 6 5 116 74 29.17647 154.3137 25.6
## 7 3 78 50 32.00000 88.0000 31.0
## 9 2 197 70 45.00000 543.0000 30.5
## 14 1 189 60 23.00000 846.0000 30.1
## 17 0 118 84 47.00000 230.0000 45.8
## 22 8 99 84 29.17647 154.3137 35.4
## DiabetesPedigreeFunction Age
## 6 0.201 30
## 7 0.248 26
## 9 0.158 53
## 14 0.398 59
## 17 0.551 31
## 22 0.388 50
# Per-class probability predictions (fraction of trees voting each class).
y_prob <- predict(RF,
                  newdata = diab.test1,
                  type = "prob")
head(y_prob)
## 0 1
## 6 0.920 0.080
## 7 0.922 0.078
## 9 0.288 0.712
## 14 0.308 0.692
## 17 0.328 0.672
## 22 0.696 0.304
# Predicted class labels on the test set.
y_pred <- predict(RF,
                  newdata = diab.test)
# BUG FIX: predictions belong in `data` (1st argument) and the truth in
# `reference` (2nd); the original call had them reversed, which swaps the
# reported sensitivity/specificity and PPV/NPV (accuracy/kappa unchanged).
confusionMatrix(data = y_pred,
                reference = diab.test[, 9])
## NOTE(review): output below is from the original (swapped) call;
## re-knit to refresh.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 86 14
## 1 21 33
##
## Accuracy : 0.7727
## 95% CI : (0.6984, 0.8363)
## No Information Rate : 0.6948
## P-Value [Acc > NIR] : 0.02001
##
## Kappa : 0.4856
##
## Mcnemar's Test P-Value : 0.31049
##
## Sensitivity : 0.8037
## Specificity : 0.7021
## Pos Pred Value : 0.8600
## Neg Pred Value : 0.6111
## Prevalence : 0.6948
## Detection Rate : 0.5584
## Detection Prevalence : 0.6494
## Balanced Accuracy : 0.7529
##
## 'Positive' Class : 0
##
# Unsupervised feature matrix: all predictor columns, excluding the
# Outcome label in column 9 (equivalent to selecting columns 1:8).
diab.train2 <- diab.train[, -9, drop = FALSE]
head(diab.train2)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72.00000 35.00000 154.3137 33.6
## 2 1 85 66.00000 29.00000 154.3137 26.6
## 3 8 183 64.00000 29.17647 154.3137 23.3
## 4 1 89 66.00000 23.00000 94.0000 28.1
## 5 0 137 40.00000 35.00000 168.0000 43.1
## 8 10 115 72.43027 29.17647 154.3137 35.3
## DiabetesPedigreeFunction Age
## 1 0.627 50
## 2 0.351 31
## 3 0.672 32
## 4 0.167 21
## 5 2.288 33
## 8 0.134 29
# Elbow method: total within-cluster sum of squares (WCSS) for k = 1..10.
# NOTE(review): the original ran this identical loop twice (once into an
# unused `wcss2`, once with a debug print(i)); the duplicate and the debug
# output were removed, the result vector is preallocated, and a seed makes
# the random kmeans() starts reproducible.
set.seed(42)
n_clusters <- 10  # upper limit; any reasonable number works
wcss <- numeric(n_clusters)
for (i in seq_len(n_clusters)) {
  model <- kmeans(diab.train2, i)
  wcss[i] <- sum(model$withinss)  # == model$tot.withinss
}
# Look for the "elbow" where adding clusters stops reducing WCSS much.
plot(seq_len(n_clusters),
     wcss,
     type = "b",
     main = "The Elbow Method",
     xlab = "Number of Clusters",
     ylab = "WCSS")
# Final k-means fit with k = 2 (suggested by the elbow plot above).
# BUG FIX(naming): the original stored the fit in a variable called
# `kmeans`, shadowing stats::kmeans(); renamed to `km` (every use of the
# old name is within this section). A seed makes the fit reproducible.
set.seed(42)
km <- kmeans(x = diab.train2,
             centers = 2)
# Cluster assignment of the first 10 training rows.
head(km$cluster, 10)
## 1 2 3 4 5 8 10 11 12 13
## 1 1 1 2 1 1 1 1 1 1
# Cluster centroids in the original feature space.
head(km$centers, 10)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 4.057816 127.1758 73.76700 29.84872 178.91355 32.70388
## 2 2.795918 104.6039 68.18367 27.04082 76.16327 31.50011
## DiabetesPedigreeFunction Age
## 1 0.4726959 34.72805
## 2 0.4697143 27.63946
# Per-cluster within-cluster sums of squares; their sum is tot.withinss.
wcss <- km$withinss
wcss
## [1] 3117899.1 258757.9
sum(km$withinss)
## [1] 3376657
head(km$tot.withinss, 10)
## [1] 3376657
y_kmeans <- km$cluster
# 2-D projection of the clusters (cluster::clusplot uses the first two
# principal components).
clusplot(diab.train2,
         y_kmeans,
         lines = 0,
         shade = FALSE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = "Clusters of Patients")
# Agglomerative hierarchical clustering with Ward linkage on Euclidean
# distances. NOTE(review): "ward.D" expects SQUARED distances; with plain
# euclidean dist, "ward.D2" implements the classical Ward criterion --
# consider switching and re-checking the dendrogram.
hc = hclust(d = dist(diab.train2,
method = "euclidean"),
method = "ward.D") #Variance within each cluster
plot(hc,
main = "Dendrogram",
xlab = "Patients",
ylab = "Euclidean Distances")
# Cut the tree into 2 clusters to match the k-means analysis above.
y_hc = cutree(hc, 2)
head(y_hc)
## 1 2 3 4 5 8
## 1 1 1 1 1 1
# Same 2-D principal-component projection as used for k-means.
clusplot(diab.train2,
y_hc,
lines = 0,
shade = FALSE,
color = TRUE,
labels= 2,
plotchar = FALSE,
span = TRUE,
main = 'Clusters of Patients')
# Cluster-count diagnostics for hierarchical clustering (hcut).
# NOTE(review): fviz_nbclust comes from the factoextra package, which is
# not among the libraries visible at the top of this file -- confirm it is
# loaded (or attach it) before knitting.
# Average silhouette width: pick the k that maximizes it.
fviz_nbclust(diab.train2,
hcut,
method = "silhouette") +
labs(subtitle = "Silhouette method")
# Gap statistic with 50 bootstrap reference sets; nstart = 25 restarts
# stabilizes the underlying clustering.
fviz_nbclust(diab.train2,
hcut,
nstart = 25,
method = "gap_stat",
nboot = 50) +
labs(subtitle = "Gap statistic method")